source("utils.R")Assemble environment and DOC data.
We will create 5 datasets, one for each prediction task:
surface annual
surface seasonal
epipelagic annual
mesopelagic annual
bathypelagic annual
Load data
load("data/00.all_env.Rdata")
load("data/01.doc_data.Rdata")
# Load layer inclusion
load("data/00.bathymetry_inc.Rdata")
# Clean names data: replace . by _
df_env <- df_env %>% rename_all(~ sub("\\.", "_", .x))
df_doc <- df_doc %>% rename_all(~ sub("\\.", "_", .x))Join
Surface annual
## New predictions data
# Env data in the surface layer, annual, to be used for new predictions. NB: this requires droping any row with missing values
df_ann_surf_pred <- df_env %>%
filter(season == 0) %>%
select(-contains(c("epi", "meso", "bathy"))) %>%
left_join(df_inc %>% select(lon, lat, surf_include), by = join_by(lon, lat)) %>%
filter(surf_include) %>%
select(-surf_include)
# Detect pixels where less that 1/4 of predictors are missing
sel_pix <- df_ann_surf_pred %>%
pivot_longer(temperature_surf:par, names_to = "variable", values_to = "value") %>%
group_by(lon, lat, season) %>%
summarise_all(list(percent_na)) %>%
ungroup() %>%
rename(percent_na = value) %>%
filter(percent_na < 0.10) %>%
select(lon, lat, season) %>%
mutate(use = TRUE)
# Keep only these pixels
df_ann_surf_pred <- df_ann_surf_pred %>%
left_join(sel_pix, by = join_by(lon, lat, season)) %>%
filter(use) %>%
select(-use)
## Model fitting data
# Join env data with doc data, drop missing observations, apply log transformation to doc value
df_ann_surf_fit <- df_doc %>%
filter(season == 0) %>%
select(-contains(c("epi", "meso", "bathy"))) %>%
left_join(df_ann_surf_pred, by = join_by(lon, lat, season)) %>%
drop_na() %>%
mutate(log_doc_surf = log(doc_surf), .before = doc_surf)Plot map of coverage.
# Plot coverage of data
# points are model fitting data
# blue rectangles are the prediction area
ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(data = df_ann_surf_pred, aes(x = lon, y = lat), fill = "steelblue3", alpha = 0.5) +
geom_point(data = df_ann_surf_fit, aes(x = lon, y = lat, colour = log_doc_surf), size = 0.5) +
ggplot2::scale_colour_viridis_c(option = "A") +
coord_quickmap(expand = 0) +
ggtitle("Surface annual")Plot doc distributions, after log-transformation
ggplot(df_ann_surf_fit) + geom_histogram(aes(x = log_doc_surf)) + ggtitle("Surface annual")Even with log-transformation, we still have a long-tail distribution. Let’s check where are these high values.
ggplot(df_ann_surf_fit) +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_point(aes(x = lon, y = lat, colour = log_doc_surf > 5), size = 0.5) +
coord_quickmap(expand = 0) +
ggtitle("Surface annual")Surface seasonal
## New predictions data
# Env data in the surface layer, annual, to be used for new predictions. NB: this requires droping any row with missing values
df_seas_surf_pred <- df_env %>%
filter(season != 0) %>%
select(-contains(c("epi", "meso", "bathy"))) %>%
left_join(df_inc %>% select(lon, lat, surf_include), by = join_by(lon, lat)) %>%
filter(surf_include) %>%
select(-surf_include)
# Detect pixels where less that 1/4 of predictors are missing
sel_pix <- df_seas_surf_pred %>%
pivot_longer(temperature_surf:par, names_to = "variable", values_to = "value") %>%
group_by(lon, lat, season) %>%
summarise_all(list(percent_na)) %>%
ungroup() %>%
rename(percent_na = value) %>%
filter(percent_na < 0.10) %>%
select(lon, lat, season) %>%
mutate(use = TRUE)
# Keep only these pixels
df_seas_surf_pred <- df_seas_surf_pred %>%
left_join(sel_pix, by = join_by(lon, lat, season)) %>%
filter(use) %>%
select(-use)
## Model fitting data
# Join env data with doc data, drop missing observations, apply log transformation to doc value
df_seas_surf_fit <- df_doc %>%
filter(season != 0) %>%
select(-contains(c("epi", "meso", "bathy"))) %>%
left_join(df_seas_surf_pred, by = join_by(lon, lat, season)) %>%
drop_na() %>%
mutate(log_doc_surf = log(doc_surf), .before = doc_surf)Plot map of coverage.
# Plot coverage of data
# points are model fitting data
# blue rectangles are the prediction area
ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(data = df_seas_surf_pred, aes(x = lon, y = lat), fill = "steelblue3", alpha = 0.5) +
geom_point(data = df_seas_surf_fit, aes(x = lon, y = lat, colour = log_doc_surf), size = 0.5) +
ggplot2::scale_colour_viridis_c(option = "A") +
coord_quickmap(expand = 0) +
facet_wrap(~season) +
ggtitle("Surface seasonal")A few very high values in autumn.
Plot doc distributions, after log-transformation
ggplot(df_seas_surf_fit) + geom_histogram(aes(x = log_doc_surf)) + facet_wrap(~season) + ggtitle("Surface seasonal")Even with log-transformation, we still have a long-tail distribution, especially in autumn. Let’s check where are these high values.
ggplot(df_seas_surf_fit) +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_point(aes(x = lon, y = lat, colour = log_doc_surf > 5), size = 0.5) +
coord_quickmap(expand = 0) +
facet_wrap(~season) +
ggtitle("Surface seasonal")Epipelagic annual
## New predictions data
# Env data in the surface + epipelagic layer, annual, to be used for new predictions. NB: this requires droping any row with missing values
df_ann_epi_pred <- df_env %>%
filter(season == 0) %>%
select(-contains(c("meso", "bathy"))) %>%
filter(if_any(contains("epi"), ~!is.na(.))) %>%
left_join(df_inc %>% select(lon, lat, epi_include), by = join_by(lon, lat)) %>%
filter(epi_include) %>%
select(-epi_include)
# Detect pixels where less that 1/4 of predictors are missing
sel_pix <- df_ann_epi_pred %>%
pivot_longer(temperature_surf:nitrate_epi, names_to = "variable", values_to = "value") %>%
group_by(lon, lat, season) %>%
summarise_all(list(percent_na)) %>%
ungroup() %>%
rename(percent_na = value) %>%
filter(percent_na < 0.10) %>%
select(lon, lat, season) %>%
mutate(use = TRUE)
# Keep only these pixels
df_ann_epi_pred <- df_ann_epi_pred %>%
left_join(sel_pix, by = join_by(lon, lat, season)) %>%
filter(use) %>%
select(-use)
## Model fitting data
# Join env data with doc data, drop missing observations, apply log transformation to doc value
df_ann_epi_fit <- df_doc %>%
filter(season == 0) %>%
select(-contains(c("surf", "meso", "bathy"))) %>%
left_join(df_ann_epi_pred, by = join_by(lon, lat, season)) %>%
drop_na() %>%
mutate(log_doc_epi = log(doc_epi), .before = doc_epi)Plot map of coverage.
# Plot coverage of data
# points are model fitting data
# blue rectangles are the prediction area
ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(data = df_ann_epi_pred, aes(x = lon, y = lat), fill = "steelblue3", alpha = 0.5) +
geom_point(data = df_ann_epi_fit, aes(x = lon, y = lat, colour = log_doc_epi), size = 0.5) +
ggplot2::scale_colour_viridis_c(option = "A") +
coord_quickmap(expand = 0) +
ggtitle("Epi annual")Plot doc distributions, after log-transformation
ggplot(df_ann_epi_fit) + geom_histogram(aes(x = log_doc_epi)) + ggtitle("Epi annual")OK-ish distribution.
Mesopelagic annual
## New predictions data
# Env data in the surface + epipelagic + mesopelagic layer, annual, to be used for new predictions. NB: this requires droping any row with missing values
df_ann_meso_pred <- df_env %>%
filter(season == 0) %>%
select(-contains("bathy")) %>%
left_join(df_inc %>% select(lon, lat, meso_include), by = join_by(lon, lat)) %>%
filter(meso_include) %>%
select(-meso_include)
# Detect pixels where less that 1/4 of predictors are missing
sel_pix <- df_ann_meso_pred %>%
pivot_longer(temperature_surf:nitrate_meso, names_to = "variable", values_to = "value") %>%
group_by(lon, lat, season) %>%
summarise_all(list(percent_na)) %>%
ungroup() %>%
rename(percent_na = value) %>%
filter(percent_na < 0.10) %>%
select(lon, lat, season) %>%
mutate(use = TRUE)
# Keep only these pixels
df_ann_meso_pred <- df_ann_meso_pred %>%
left_join(sel_pix, by = join_by(lon, lat, season)) %>%
filter(use) %>%
select(-use)
## Model fitting data
# Join env data with doc data, drop missing observations, apply log transformation to doc value
df_ann_meso_fit <- df_doc %>%
filter(season == 0) %>%
select(-contains(c("surf", "epi", "bathy"))) %>%
left_join(df_ann_meso_pred, by = join_by(lon, lat, season)) %>%
drop_na() %>%
mutate(log_doc_meso = log(doc_meso), .before = doc_meso)Plot map of coverage.
# Plot coverage of data
# points are model fitting data
# blue rectangles are the prediction area
ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(data = df_ann_meso_pred, aes(x = lon, y = lat), fill = "steelblue3", alpha = 0.5) +
geom_point(data = df_ann_meso_fit, aes(x = lon, y = lat, colour = log_doc_meso), size = 0.5) +
ggplot2::scale_colour_viridis_c(option = "A") +
coord_quickmap(expand = 0) +
ggtitle("Meso annual")Plot doc distributions, after log-transformation
ggplot(df_ann_meso_fit) + geom_histogram(aes(x = log_doc_meso)) + ggtitle("Meso annual")OK-ish distribution.
Bathypelagic annual
## New predictions data
# Env data in the surface + mesopelagic + bathypelagic layer, annual, to be used for new predictions. NB: this requires droping any row with missing values
df_ann_bathy_pred <- df_env %>%
filter(season == 0) %>%
left_join(df_inc %>% select(lon, lat, bathy_include), by = join_by(lon, lat)) %>%
filter(bathy_include) %>%
select(-bathy_include)
# Detect pixels where less that 1/4 of predictors are missing
sel_pix <- df_ann_bathy_pred %>%
pivot_longer(temperature_surf:nitrate_bathy, names_to = "variable", values_to = "value") %>%
group_by(lon, lat, season) %>%
summarise_all(list(percent_na)) %>%
ungroup() %>%
rename(percent_na = value) %>%
filter(percent_na < 0.10) %>%
select(lon, lat, season) %>%
mutate(use = TRUE)
# Keep only these pixels
df_ann_bathy_pred <- df_ann_bathy_pred %>%
left_join(sel_pix, by = join_by(lon, lat, season)) %>%
filter(use) %>%
select(-use)
## Model fitting data
# Join env data with doc data, drop missing observations, apply log transformation to doc value
df_ann_bathy_fit <- df_doc %>%
filter(season == 0) %>%
select(-contains(c("surf", "epi", "meso"))) %>%
left_join(df_ann_bathy_pred, by = join_by(lon, lat, season)) %>%
drop_na() %>%
mutate(log_doc_bathy = log(doc_bathy), .before = doc_bathy)Plot map of coverage.
# Plot coverage of data
# points are model fitting data
# blue rectangles are the prediction area
ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(data = df_ann_bathy_pred, aes(x = lon, y = lat), fill = "steelblue3", alpha = 0.5) +
geom_point(data = df_ann_bathy_fit, aes(x = lon, y = lat, colour = log_doc_bathy), size = 0.5) +
ggplot2::scale_colour_viridis_c(option = "A") +
coord_quickmap(expand = 0) +
ggtitle("Bathy annual")Plot doc distributions, after log-transformation
ggplot(df_ann_bathy_fit) + geom_histogram(aes(x = log_doc_bathy)) + ggtitle("Bathy annual")OK-ish distribution.
Remove extreme values
The very high DOC values in the surface layer are found at high latitudes in the north (Kara Sea and Laptev Sea). This is because in these areas there are strong river discharge bringing a lot of DOC into the sea, thus these values are not representative of oceanic processes. Let’s remove them.
Kara Sea: 50-100° E, 70-80°N
Leptev Sea: 100-150° E, 70-80°N
Set DOC values to NA in these areas.
# Annual surface data
df_ann_surf_fit <- df_ann_surf_fit %>%
mutate(
log_doc_surf = ifelse(between(lon, 50, 100) & between(lat, 70, 80), NA, log_doc_surf),
log_doc_surf = ifelse(between(lon, 100, 150) & between(lat, 70, 80), NA, log_doc_surf),
doc_surf = ifelse(between(lon, 50, 100) & between(lat, 70, 80), NA, doc_surf),
doc_surf = ifelse(between(lon, 100, 150) & between(lat, 70, 80), NA, doc_surf)
) %>%
drop_na()
# Seasonal surface data
df_seas_surf_fit <- df_seas_surf_fit %>%
mutate(
log_doc_surf = ifelse(between(lon, 50, 100) & between(lat, 70, 80), NA, log_doc_surf),
log_doc_surf = ifelse(between(lon, 100, 150) & between(lat, 70, 80), NA, log_doc_surf),
doc_surf = ifelse(between(lon, 50, 100) & between(lat, 70, 80), NA, doc_surf),
doc_surf = ifelse(between(lon, 100, 150) & between(lat, 70, 80), NA, doc_surf)
) %>%
drop_na()Plot maps
ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(data = df_ann_surf_pred, aes(x = lon, y = lat), fill = "steelblue3", alpha = 0.5) +
geom_point(data = df_ann_surf_fit, aes(x = lon, y = lat, colour = log_doc_surf), size = 0.5) +
ggplot2::scale_colour_viridis_c(option = "A") +
ggtitle("Surface annual") +
coord_quickmap(expand = 0)ggplot() +
geom_polygon(data = world, aes(x = lon, y = lat, group = group), fill = "grey") +
geom_raster(data = df_seas_surf_pred, aes(x = lon, y = lat), fill = "steelblue3", alpha = 0.5) +
geom_point(data = df_seas_surf_fit, aes(x = lon, y = lat, colour = log_doc_surf), size = 0.5) +
ggplot2::scale_colour_viridis_c(option = "A") +
coord_quickmap(expand = 0) +
ggtitle("Surface seasonal") +
facet_wrap(~season)Let’s replot DOC distributions.
ggplot(df_ann_surf_fit) + geom_histogram(aes(x = log_doc_surf), bins = 100) + ggtitle("Surface annual")ggplot(df_seas_surf_fit) + geom_histogram(aes(x = log_doc_surf), bins = 100) + facet_wrap(~season) + ggtitle("Surface seasonal")Not perfect, but not too bad for now. Will likely require more filtering.
Save everything
For each prediction task (surface annual, surface seasonal, meso annual and bathy annual) two datasets are saved:
one with both DOC and env data at locations where DOC is available: this will be used to fit the model.
one with global map of env data, to predict DOC at the global scale
save(df_ann_surf_fit, df_ann_surf_pred, file = "data/02.ann_surf.Rdata")
save(df_seas_surf_fit, df_seas_surf_pred, file = "data/02.seas_surf.Rdata")
save(df_ann_epi_fit, df_ann_epi_pred, file = "data/02.ann_epi.Rdata")
save(df_ann_meso_fit, df_ann_meso_pred, file = "data/02.ann_meso.Rdata")
save(df_ann_bathy_fit, df_ann_bathy_pred, file = "data/02.ann_bathy.Rdata")